In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

NumPy


In [5]:
## numpy array
a = np.array([1, 4, 6])
print a.shape
print 
print np.ones((3, 4))
print
print np.zeros((2, 5))
print
print np.arange(6).reshape(2, 3)
print
print a.T
print
print np.hstack([a, a])
print
print np.vstack([a, a])


(3L,)

[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]]

[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]

[[0 1 2]
 [3 4 5]]

[1 4 6]

[1 4 6 1 4 6]

[[1 4 6]
 [1 4 6]]

In [6]:
## matrix (dot) product vs. element-wise multiplication
print np.dot(a, a)  # or a.dot(a)
print 
print a*a


53

[ 1 16 36]

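The same distinction holds for 2-d arrays: dot gives the matrix product, * multiplies entry by entry. A minimal sketch (the matrix m2 is illustrative):

m2 = np.array([[1, 2], [3, 4]])
print m2.dot(m2)   # matrix product:  [[ 7 10] [15 22]]
print
print m2 * m2      # element-wise:    [[ 1  4] [ 9 16]]
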
In [7]:
# you can convert a 1-d array to a 2-d array with np.newaxis
print 'a:'
print a
print 'a.shape:', a.shape
print 
print 'a[np.newaxis] is a 2-d row vector:'
print a[np.newaxis]
print 'a[np.newaxis].shape:', a[np.newaxis].shape
print

print 'a[np.newaxis].T is a 2-d column vector:'
print a[np.newaxis].T
print 'a[np.newaxis].T.shape:', a[np.newaxis].T.shape
print


a:
[1 4 6]
a.shape: (3L,)

a[np.newaxis] is a 2-d row vector:
[[1 4 6]]
a[np.newaxis].shape: (1L, 3L)

a[np.newaxis].T is a 2-d column vector:
[[1]
 [4]
 [6]]
a[np.newaxis].T.shape: (3L, 1L)

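Equivalently, np.newaxis can be placed on a specific axis, and reshape(-1, 1) does the same job (-1 tells numpy to infer that dimension); a quick sketch:

print a[:, np.newaxis]         # same 2-d column vector as a[np.newaxis].T
print a.reshape(-1, 1).shape   # a (3, 1) column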

In [8]:
# numpy provides a ton of other functions for working with matrices
m = np.array([[1, 2],[3, 4]])
m_inverse = np.linalg.inv(m)
print 'inverse of [[1, 2],[3, 4]]:'
print m_inverse
print

print 'm.dot(m_inverse):'
print m.dot(m_inverse)


inverse of [[1, 2],[3, 4]]:
[[-2.   1. ]
 [ 1.5 -0.5]]

m.dot(m_inverse):
[[  1.00000000e+00   1.11022302e-16]
 [  0.00000000e+00   1.00000000e+00]]

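If the inverse is only a means to solving a linear system, np.linalg.solve is the usual (and numerically more stable) route; a sketch with an illustrative right-hand side:

rhs = np.array([5., 6.])
x = np.linalg.solve(m, rhs)   # solves m.dot(x) == rhs without forming m_inverse
print x                       # same result as m_inverse.dot(rhs)
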
In [9]:
# and for all kinds of scientific computing, like generating random numbers:
np.random.seed(5678)
n = np.random.randn(3, 4)
print 'a matrix with random entries drawn from a Normal(0, 1) distribution:'
print n


a matrix with random entries drawn from a Normal(0, 1) distribution:
[[-0.70978938 -0.01719118  0.31941137 -2.26533107]
 [-1.37745366  1.94998073 -0.56381007 -0.84373759]
 [ 0.22453858 -0.39137772  0.60550347 -0.68615034]]

In [10]:
np.random.seed(3333)
n_data = 10 # number of data points, i.e. N
n_dim = 5   # number of dimensions of each data point, i.e. D

betas = np.random.randn(n_dim + 1)

X_no_constant = np.random.randn(n_data, n_dim)
print 'X_no_constant:'
print X_no_constant
print 

# prepend a column of ones (the intercept term) and compute y = X.dot(betas)
X = np.hstack([np.ones(n_data)[np.newaxis].T, X_no_constant])
y = np.dot(X, betas)

# Tests:
y_expected = np.array([-0.41518357, -9.34696153, 5.08980544, 
                       -0.26983873, -1.47667864, 1.96580794, 
                       6.87009791, -2.07784135, -0.7726816, 
                       -2.74954984])
np.testing.assert_allclose(y, y_expected)
print '****** Tests passed! ******'


X_no_constant:
[[-0.92232935  0.27352359 -0.86339625  1.43766044 -1.71379871]
 [ 0.179322   -0.89138595  2.13005603  0.51898975 -0.41875106]
 [ 0.34010119 -1.07736609 -1.02314142 -1.02518535  0.40972072]
 [ 1.18883814  1.01044759  0.3108216  -1.17868611 -0.49526331]
 [-1.50248369 -0.196458    0.34752922 -0.79200465 -0.31534705]
 [ 1.73245191 -1.42793626 -0.94376587  0.86823495 -0.95946769]
 [-1.07074604 -0.06555247 -2.17689578  1.58538804  1.81492637]
 [-0.73706088  0.77546031  0.42653908 -0.51853723 -0.53045538]
 [ 1.09620536 -0.69557321  0.03080082  0.25219596 -0.35304303]
 [-0.93971165  0.04448078  0.04273069  0.4961477  -1.7673568 ]]

****** Tests passed! ******
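
As an aside, np.column_stack is an equivalent, arguably more readable way to prepend the intercept column; a quick sketch:

X_alt = np.column_stack([np.ones(n_data), X_no_constant])
np.testing.assert_allclose(X, X_alt)   # identical to the hstack/newaxis version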

Pandas


In [11]:
b = np.array([[6, 7], [3, 1], [4, 0]])
df = pd.DataFrame(data=b,  columns=['Weight', 'Height'])
print 'b:'
print b
print 
print 'DataFrame version of b:'
print df
print


b:
[[6 7]
 [3 1]
 [4 0]]

DataFrame version of b:
   Weight  Height
0       6       7
1       3       1
2       4       0

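Columns are selected by name (each column is a pandas Series), and new columns can be added by assignment; a small sketch where the 'Ratio' column is purely illustrative:

print df['Weight']
df['Ratio'] = df['Height'] * 1.0 / df['Weight']   # *1.0 forces float division under Python 2
print df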

In [12]:
baseball = pd.read_csv('data/baseball.dat.txt')

In [16]:
# some handy ways to get a first look at a DataFrame:
# baseball.head()
# baseball.describe()
# baseball.keys()
# baseball.info()

In [18]:
millionaire_indices = baseball['Salary'] > 1000
# you can use the boolean mask to look at a subset of your original dataframe
print 'baseball.shape:', baseball.shape
print "baseball[millionaire_indices].shape:", baseball[millionaire_indices].shape
baseball[millionaire_indices][['Salary', 'AVG', 'Runs', 'Name']].head()


baseball.shape: (337, 18)
baseball[millionaire_indices].shape: (139, 18)
Out[18]:
Salary AVG Runs Name
0 3300 0.272 69 Andre Dawson
1 2600 0.269 58 Steve Buchele
2 2500 0.249 54 Kal Daniels
3 2475 0.260 59 Shawon Dunston
4 2313 0.273 87 Mark Grace

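Boolean masks can be combined with & and | (each comparison needs its own parentheses); a sketch using columns from this dataset:

rich_good_hitters = baseball[(baseball['Salary'] > 1000) & (baseball['AVG'] > .300)]
print rich_good_hitters.shape
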
In [19]:
shoe_size_df = pd.read_csv('data/baseball2.dat.txt')
shoe_size_df.shape


Out[19]:
(3, 2)

In [20]:
merged = pd.merge(baseball, shoe_size_df, on=['Name'])
merged


Out[20]:
Salary AVG OBP Runs Hits Doubles Triples HR RBI Walks SO SB Errs free agency eligibility free agent in 1991/2 arbitration eligibility arbitration in 1991/2 Name Shoe Size
0 3300 0.272 0.302 69 153 21 4 31 104 22 80 4 3 1 0 0 0 Andre Dawson 11
1 2313 0.273 0.346 87 169 28 5 8 58 70 53 3 8 0 0 1 0 Mark Grace 13
2 200 0.203 0.240 39 64 10 1 10 33 14 96 13 6 0 0 0 0 Sammy Sosa 12

In [23]:
merged_outer = pd.merge(baseball, shoe_size_df, on=['Name'], how='outer')
merged_outer.head()


Out[23]:
Salary AVG OBP Runs Hits Doubles Triples HR RBI Walks SO SB Errs free agency eligibility free agent in 1991/2 arbitration eligibility arbitration in 1991/2 Name Shoe Size
0 3300 0.272 0.302 69 153 21 4 31 104 22 80 4 3 1 0 0 0 Andre Dawson 11
1 2600 0.269 0.335 58 111 17 2 18 66 39 69 0 3 1 1 0 0 Steve Buchele NaN
2 2500 0.249 0.337 54 115 15 1 17 73 63 116 6 5 1 0 0 0 Kal Daniels NaN
3 2475 0.260 0.292 59 128 22 7 12 50 23 64 21 21 0 0 1 0 Shawon Dunston NaN
4 2313 0.273 0.346 87 169 28 5 8 58 70 53 3 8 0 0 1 0 Mark Grace 13
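
Besides 'outer', how='left' keeps every row of the left frame (and how='right' every row of the right one); a quick sketch, assuming names are unique in shoe_size_df:

merged_left = pd.merge(baseball, shoe_size_df, on=['Name'], how='left')
print merged_left.shape   # all 337 baseball rows, with NaN shoe sizes where unmatched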

Plotting with Matplotlib


In [25]:
baseball = pd.read_csv('data/baseball.dat.txt')

In [26]:
f = plt.figure()
plt.hist(baseball['Hits'], bins=15)   # plot or scatter
plt.xlabel('Number of Hits')
plt.ylabel('Frequency')
plt.title('Histogram of Number of Hits')
f.set_size_inches(10, 5)
plt.show()

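A scatter plot follows the same pattern; a sketch plotting Salary against Hits (two columns of this dataset):

f = plt.figure()
plt.scatter(baseball['Hits'], baseball['Salary'])
plt.xlabel('Number of Hits')
plt.ylabel('Salary')
plt.title('Salary vs. Number of Hits')
f.set_size_inches(10, 5)
plt.show()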

Scikit-Learn Linear Regression


In [31]:
from sklearn import linear_model

In [32]:
## linear regression models
model_lr = linear_model.LinearRegression()
model_ridge = linear_model.Ridge(alpha=1)
model_lasso = linear_model.Lasso(alpha=1)
model_en = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.1)

In [27]:
def mean_squared_error(y_true, y_pred):
    """
    calculate the mean_squared_error given a vector of true ys and a vector of predicted ys
    """
    diff = y_true - y_pred
    return np.dot(diff, diff) / len(diff)

def predict_test_values(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    return model.predict(X_test)
    

def calc_train_and_test_error(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    return mean_squared_error(y_train, y_pred_train), mean_squared_error(y_test, y_pred_test)

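A quick sanity check of mean_squared_error on hand-computable values (the numbers are illustrative):

print mean_squared_error(np.array([1., 2., 3.]), np.array([1., 2., 5.]))   # (0 + 0 + 4) / 3
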
In [33]:
# load overfitting data
with np.load('data/overfitting_data.npz') as data:
    x_train = data['x_train']
    y_train = data['y_train']
    x_test = data['x_test']
    y_test = data['y_test']

In [34]:
## Model performance

print "Linear Regression Training and Test Errors:"
print calc_train_and_test_error(model_lr, x_train, y_train, x_test, y_test)
print

print "Ridge Regression Training and Test Errors:"
print calc_train_and_test_error(model_ridge, x_train, y_train, x_test, y_test)
print

print "Lasso Regression Training and Test Errors:"
print calc_train_and_test_error(model_lasso, x_train, y_train, x_test, y_test)
print

print 'ElasticNet Training and Test Errors:'
print calc_train_and_test_error(model_en, x_train, y_train, x_test, y_test)
print


Linear Regression Training and Test Errors:
(2.4835421623899702e-05, 283.52728792173116)

Ridge Regression Training and Test Errors:
(0.018634112597992421, 9.5641560683730305)

Lasso Regression Training and Test Errors:
(4.1142351854727677, 4.6028697944107098)

ElasticNet Training and Test Errors:
(1.9616145613107794, 3.8189893038857918)


In [35]:
n_disp_coefs = 10

print 'Linear Regression Coefficients:'
print model_lr.coef_[:n_disp_coefs]
print

print 'Ridge Regression Coefficients:'
print model_ridge.coef_[:n_disp_coefs]
print

print 'LASSO Coefficients:'
print model_lasso.coef_[:n_disp_coefs]
print

print 'ElasticNet Coefficients:'
print model_en.coef_[:n_disp_coefs]
print


Linear Regression Coefficients:
[  5.22757470e-01   2.78289824e+00   4.04383818e+00   1.17544241e+00
   3.13230537e-01  -1.28127160e-01   5.11682173e-01   3.83754833e-03
  -1.19481096e+00   9.56448172e-01]

Ridge Regression Coefficients:
[ 1.01611626  1.77246927  3.06534773 -0.0333898   0.04378713  0.10472107
 -0.13445823  0.12656315  0.05779722  0.10204281]

LASSO Coefficients:
[ 0.03375129  0.92694409  1.92659636  0.          0.          0.         -0.
  0.          0.          0.        ]

ElasticNet Coefficients:
[ 0.61034977  1.16675401  1.79600624  0.          0.          0.00686607
  0.          0.02027936  0.00469244  0.00644604]

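A compact way to see the sparsity that the L1 penalty induces is to count nonzero coefficients (a sketch; the counts depend on the fits above):

print 'Ridge nonzero coefficients:', np.sum(model_ridge.coef_ != 0)
print 'LASSO nonzero coefficients:', np.sum(model_lasso.coef_ != 0)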

In [36]:
print "Sum of Linear Regression Coefficients:"
print np.sum(np.abs(model_lr.coef_))
print

print "Sum of Ridge Regression Coefficients:"
print np.sum(np.abs(model_ridge.coef_))
print

print "Sum of Lasso Regression Coefficients:"
print np.sum(np.abs(model_lasso.coef_))
print

print 'Sum of ElasticNet Coefficients:'
print np.sum(np.abs(model_en.coef_))
print


Sum of Linear Regression Coefficients:
338.387469048

Sum of Ridge Regression Coefficients:
62.4912904062

Sum of Lasso Regression Coefficients:
2.88729174216

Sum of ElasticNet Coefficients:
9.82525057342

Model Selection

Types of Cross Validation

Validation Set Cross Validation


In [37]:
# sklearn provides a helper function for splitting off a validation set
from sklearn.cross_validation import train_test_split
validation_portion = 0.1
seed = 1234
x_train_small, x_valid, y_train_small, y_valid = \
    train_test_split(x_train, y_train, test_size=validation_portion, random_state=seed)

print 'Original Training Set Size:'
print x_train.shape, y_train.shape
print

print 'Reduced Training Set Size:'
print x_train_small.shape, y_train_small.shape
print

print 'Validation Set Size:'
print x_valid.shape, y_valid.shape
print


Original Training Set Size:
(600L, 598L) (600L,)

Reduced Training Set Size:
(540L, 598L) (540L,)

Validation Set Size:
(60L, 598L) (60L,)


In [38]:
def validation_set_error(model, x_train, y_train, validation_portion=0.1, seed=1234):
    # hold out a validation set, fit on the rest, and return the validation MSE

    x_train_small, x_valid, y_train_small, y_valid = \
        train_test_split(x_train, y_train, test_size=validation_portion, random_state=seed)
    model.fit(x_train_small, y_train_small)
    y_pred_valid = model.predict(x_valid)
    return mean_squared_error(y_valid, y_pred_valid)
      
    
# set up models
model_lr_valid = linear_model.LinearRegression()
model_ridge_valid = linear_model.Ridge(alpha=10)

# calculate errors
valid_portion = .1
n_seeds = 5
print "Linear Regression Training and Test Errors:"
print calc_train_and_test_error(model_lr_valid, x_train_small, y_train_small, x_test, y_test)

print
print "Linear Regression Validation Errors:"
print validation_set_error(model_lr_valid, x_train, y_train, validation_portion=0.1, seed=1234)
print 

for seed in range(n_seeds):
    print validation_set_error(model_lr_valid, x_train, y_train, validation_portion=valid_portion, seed=seed)
    print

print "Ridge Regression Training and Test Errors:"
print calc_train_and_test_error(model_ridge_valid, x_train_small, y_train_small, x_test, y_test)


print
print "Ridge Regression Validation Errors:"
print validation_set_error(model_ridge_valid, x_train, y_train, validation_portion=0.1, seed=1234)
print 

for seed in range(n_seeds):
    print validation_set_error(model_ridge_valid, x_train, y_train, validation_portion=valid_portion, seed=seed)
    print


Linear Regression Training and Test Errors:
(6.5894013208313341e-28, 9.6373710755996189)

Linear Regression Validation Errors:
9.36759564041

10.4039988935

11.6352333478

8.8241606146

9.20945551949

7.60088829288

Ridge Regression Training and Test Errors:
(0.037116269305341815, 4.8163269566646871)

Ridge Regression Validation Errors:
4.44120540399

3.61817500364

7.12476980873

5.32580668571

5.74292650031

4.6239411424

K-Fold Cross Validation


In [42]:
# scikit-learn provides a useful object to help you perform k-fold cross validation
from sklearn.cross_validation import KFold

n_data = len(y_train)
fold_count = 0
for train_reduced_row_ids, valid_row_ids in KFold(n_data, n_folds=4):
    print
    print 
    print "FOLD %d:" % fold_count
    print "-------"
    print("train_ids:\n%s\n\nvalid_ids\n%s" % (train_reduced_row_ids, valid_row_ids))
    x_train_reduced = x_train[train_reduced_row_ids]
    y_train_reduced = y_train[train_reduced_row_ids]
    x_valid = x_train[valid_row_ids]
    y_valid = y_train[valid_row_ids]
    fold_count += 1

In [43]:
# NOTE: KFold isn't random at all, so it's important to shuffle your data before using it.
from sklearn.utils import shuffle
x_train_shuffled, y_train_shuffled = shuffle(x_train, y_train)
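
Alternatively, KFold can shuffle the fold assignments itself; a sketch, assuming this version of sklearn.cross_validation.KFold accepts the shuffle and random_state arguments (later releases do):

for train_ids, valid_ids in KFold(n_data, n_folds=4, shuffle=True, random_state=1234):
    pass   # same loop body as above, with randomized fold membership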

In [44]:
def kfold_error(model, x_train, y_train, k=4, seed=1234):
    # shuffle training data
    x_train_shuffled, y_train_shuffled = shuffle(x_train, y_train, random_state=seed)
    
    n_data = len(y_train)
    error_sum = 0
    for train_reduced_row_ids, valid_row_ids in KFold(n_data, n_folds=k):
        x_train_reduced = x_train_shuffled[train_reduced_row_ids]
        y_train_reduced = y_train_shuffled[train_reduced_row_ids]
        x_valid = x_train_shuffled[valid_row_ids]
        y_valid = y_train_shuffled[valid_row_ids]
        model.fit(x_train_reduced, y_train_reduced)
        y_valid_pred = model.predict(x_valid)
        error_sum += mean_squared_error(y_valid, y_valid_pred)
    return error_sum*1.0 / k
    

# set up models
model_lr_valid = linear_model.LinearRegression()
model_ridge_valid = linear_model.Ridge(alpha=10)

# calculate errors
n_seeds = 3
k = 5

print "Linear Regression Training and Test Errors:"
print calc_train_and_test_error(model_lr_valid, x_train, y_train, x_test, y_test)

print
print "Linear Regression K-Fold Errors:"
print 
for seed in range(n_seeds):
    print kfold_error(model_lr_valid, x_train, y_train, k=k, seed=seed)
    print 

print
print "Ridge Regression Training and Test Errors:"
print calc_train_and_test_error(model_ridge_valid, x_train, y_train, x_test, y_test)


print
print "Ridge Regression K-Fold Errors:"
print 
for seed in range(n_seeds):
    print kfold_error(model_ridge_valid, x_train, y_train, k=k, seed=seed)
    print


Linear Regression Training and Test Errors:
(2.4835421623899702e-05, 283.52728792173116)

Linear Regression K-Fold Errors:

7.21045028087

7.3510411941

6.69216918868


Ridge Regression Training and Test Errors:
(0.064063243432624289, 4.9205415455726982)

Ridge Regression K-Fold Errors:

5.77769677178

5.78170553945

5.6587338965

Model and Hyperparameter Selection with Cross Validation


In [45]:
def model_name(model):
    s = str(model).lower()
    if "linearregression" in s:
        return 'LinearRegression'
    elif "lasso" in s:
        return 'Lasso(a=%g)' % model.alpha
    elif "ridge" in s:
        return 'Ridge(a=%g)' % model.alpha
    elif "elastic" in s:
        return 'ElasticNet(a=%g, r=%g)' % (model.alpha, model.l1_ratio)
    else:
        raise ValueError("Unknown Model Type")

def create_models(alphas=(.01, .03, .1, .3, 1, 3), l1_ratios=(.7, .5, .3)):
    models = [linear_model.LinearRegression()]
    models.extend([linear_model.Ridge(a) for a in alphas])
    models.extend([linear_model.Lasso(a) for a in alphas])
    models.extend([linear_model.ElasticNet(a, l1_ratio=l) for a in alphas for l in l1_ratios])
    return models

def results_df(models, betas_true, x_train, y_train, x_test, y_test, k=4):
    n_data, n_dim = x_train.shape

    n_zeros = n_dim - len(betas_true)
    
    betas_true = np.concatenate([betas_true, np.zeros(n_zeros)])
    
    # fit models to training data
    for m in models:
        m.fit(x_train, y_train)
    
    betas = np.vstack([betas_true] + [m.coef_ for m in models])
    beta_names = ['Beta ' + str(i) for i in range(n_dim)]

    # set up model names
    model_names =  ["True Coefs"] + [model_name(m) for m in models]
    df = pd.DataFrame(data=betas, columns=beta_names, index=model_names)

    # calculate training errors
    y_preds = [m.predict(x_train) for m in models]
    errors = [np.nan] + [mean_squared_error(y_train, y_pred) for y_pred in y_preds]
    df['Train Error'] = errors

    # calculate validation errors
    errors = [np.nan] + [kfold_error(m, x_train, y_train, k=k) for m in models]
    df['Cross Validation Error'] = errors

    # calculate test errors
    y_preds = [m.predict(x_test) for m in models]
    errors = [np.nan] + [mean_squared_error(y_test, y_pred) for y_pred in y_preds]
    df['Test Error'] = errors

    return df


# these are some of the magic parameters that I used to actually 
# generate the overfitting dataset
n_dim = 598
n_dim_meaningful = 3
n_dim_disp_extra = 2

# the actual betas used to generate the y values.  the rest were 0.
betas_true = np.arange(n_dim_meaningful) + 1

# create a whole bunch of untrained models
models = create_models(alphas=(.01, .03, .1, .3, 1), l1_ratios=(.9, .7, .5))

# compute coefficients plus train/CV/test errors for every model
all_results = results_df(models, betas_true, x_train, y_train, x_test, y_test, k=4)

# decide which columns we want to display
disp_cols = ["Beta " + str(i) for i in range(n_dim_meaningful + n_dim_disp_extra)] 
disp_cols += ['Train Error', 'Cross Validation Error', 'Test Error']

# display the results
all_results[disp_cols]


Out[45]:
Beta 0 Beta 1 Beta 2 Beta 3 Beta 4 Train Error Cross Validation Error Test Error
True Coefs 1.000000 2.000000 3.000000 0.000000 0.000000 NaN NaN NaN
LinearRegression 0.522757 2.782898 4.043838 1.175442 0.313231 0.000025 6.550726 8.569427
Ridge(a=0.01) 0.867059 2.290546 3.729941 0.570987 0.380292 0.001033 6.590630 8.592261
Ridge(a=0.03) 1.028546 2.023949 3.548358 0.237767 0.370161 0.002749 6.588766 8.590441
Ridge(a=0.1) 1.088696 1.847044 3.386562 0.012841 0.272410 0.005693 6.582288 8.584100
Ridge(a=0.3) 1.065763 1.788885 3.247394 -0.050784 0.142292 0.010186 6.564170 8.566235
Ridge(a=1) 1.016116 1.772469 3.065348 -0.033390 0.043787 0.018634 6.504991 8.506474
Lasso(a=0.01) 1.076240 1.956283 2.955116 0.000000 0.025530 0.213388 1.701747 1.849997
Lasso(a=0.03) 1.042435 1.941418 2.952131 0.000000 0.002460 0.526780 1.223279 1.196381
Lasso(a=0.1) 0.972258 1.869852 2.892761 -0.000000 0.000000 0.968836 1.024546 0.895046
Lasso(a=0.3) 0.764523 1.659750 2.677197 0.000000 0.000000 1.235803 1.264843 1.120443
Lasso(a=1) 0.033751 0.926944 1.926596 0.000000 0.000000 4.114235 4.166927 4.301203
ElasticNet(a=0.01, r=0.9) 1.073393 1.951365 2.947436 0.000000 0.027348 0.195976 1.760720 1.935114
ElasticNet(a=0.01, r=0.7) 1.066012 1.936668 2.931448 0.000000 0.031150 0.163034 1.919308 2.167257
ElasticNet(a=0.01, r=0.5) 1.046440 1.907626 2.908069 0.000000 0.029275 0.127897 2.191965 2.583608
ElasticNet(a=0.03, r=0.9) 1.042974 1.931661 2.936235 0.000000 0.008043 0.484044 1.261913 1.240859
ElasticNet(a=0.03, r=0.7) 1.038972 1.908386 2.901951 0.000000 0.014723 0.401288 1.361827 1.360261
ElasticNet(a=0.03, r=0.5) 1.026654 1.885132 2.862618 0.000000 0.020993 0.316085 1.547012 1.592804
ElasticNet(a=0.1, r=0.9) 0.971808 1.860572 2.872529 -0.000000 0.000000 0.960364 1.041134 0.906070
ElasticNet(a=0.1, r=0.7) 0.968971 1.841385 2.828162 -0.000000 0.000000 0.913444 1.092529 0.967566
ElasticNet(a=0.1, r=0.5) 0.961790 1.813670 2.765941 0.000000 0.000000 0.807426 1.215067 1.092389
ElasticNet(a=0.3, r=0.9) 0.771897 1.639918 2.624446 0.000000 0.000000 1.277930 1.307354 1.168137
ElasticNet(a=0.3, r=0.7) 0.785336 1.603633 2.528307 0.000000 0.000000 1.370249 1.400441 1.273228
ElasticNet(a=0.3, r=0.5) 0.797276 1.571245 2.442910 0.000000 0.000000 1.469114 1.512212 1.390285
ElasticNet(a=1, r=0.9) 0.125708 0.934637 1.835475 0.000000 0.000000 4.112881 4.189787 4.313581
ElasticNet(a=1, r=0.7) 0.265489 0.946058 1.698143 0.000000 0.000000 4.168660 4.233218 4.410287
ElasticNet(a=1, r=0.5) 0.366721 0.954115 1.599575 0.000000 0.000000 4.252089 4.309107 4.534828

In [46]:
# scikit-learn includes some functions that make cross validation easier
# and computationally faster for some models
from sklearn import linear_model
model_ridge_cv = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
model_lasso_cv = linear_model.LassoCV(alphas=[0.1, 1.0, 10.0])
model_en_cv = linear_model.ElasticNetCV(l1_ratio=[.9], n_alphas=100)

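A sketch of how these might be used: after fitting, the cross-validated choice of regularization strength is exposed as the alpha_ attribute (per scikit-learn's trailing-underscore convention for fitted parameters):

model_ridge_cv.fit(x_train, y_train)
print 'alpha chosen by RidgeCV:', model_ridge_cv.alpha_

model_lasso_cv.fit(x_train, y_train)
print 'alpha chosen by LassoCV:', model_lasso_cv.alpha_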